Explore Distribution of CNVs and Chromosome Depth

Author

Claudia Zirión-Martínez

Published

February 13, 2025

Index

Setup

Libraries

Code
library(tidyverse)
library(patchwork)
library(ggbeeswarm)
library(ggnewscale)
library(RColorBrewer)
library(hexbin)
library(ggExtra)
library(ggrepel)

Paths

Code
metadata_path <-
    "data/processed/metadata_ashton_desj_all_weavepop_H99.csv"
chromosomes_path <-
    "../Crypto_Desjardins/results/04.Intermediate_files/03.References/chromosome_lengths.tsv"
depth_by_chrom_good_desj_path <-
    "../Crypto_Desjardins/results/04.Intermediate_files/02.Dataset/depth_quality/depth_by_chrom_good.tsv"
depth_by_chrom_good_ashton_path <-
    "../Crypto_Ashton/results/04.Intermediate_files/02.Dataset/depth_quality/depth_by_chrom_good.tsv"
cnv_calls_path <-
    "../Crypto_Desjardins_Ashton/results/02.Dataset/cnv_0.6/cnv_calls.tsv"
chrom_metrics_out_path <-
    "results/tables/chrom_metrics.tsv"
chrom_metrics_filtered_out_path <- 
    "results/tables/chrom_metrics_filtered.tsv"

Prepare dataset

Metadata

Use the metadata table that has all the samples included in the final Crypto_Desjardins_Ashton dataset (n = 1055) and H99.
Select needed columns, remove H99 and get the number of samples per dataset and lineage, per lineage, and total.

Code
metadata <- read.delim(
    metadata_path,
    header=TRUE,
    sep=",",
    stringsAsFactor = TRUE)
metadata <- metadata %>%
    select(sample, strain, source, lineage, dataset, vni_subdivision)%>%
    filter(!strain == "H99") %>%
    group_by(dataset, lineage)%>%
    mutate(samples_in_dataset_lineage = n_distinct(sample))%>%
    ungroup() %>%
    group_by(lineage)%>%
    mutate(samples_in_lineage = n_distinct(sample))%>%
    ungroup()%>%
    mutate(total_samples = n_distinct(sample))%>%
    droplevels()

Chromosome names and length

Get the nice chromosome names from the chromosomes file in WeavePop.

Code
chromosomes = read.delim(
    chromosomes_path,
    header=TRUE, sep=",") %>%
    mutate(chromosome = str_pad(chromosome, 2, pad = "0"))%>%
    mutate(chromosome = as.factor(chromosome))
levels(chromosomes$chromosome) <- paste("chr", chromosomes$chromosome, sep="")

chromosomes <- chromosomes %>%
    arrange(lineage, length)

Lineage VNII is the only one in which chromosome names coincide with the length (chr01 largest, chr14 smallest).

Called CNVs

Import the file with all called CNVs and add to it the chromosome information.

Code
cnv_calls <- read.delim(
    cnv_calls_path, 
    header=TRUE, sep="\t")%>%
    left_join(chromosomes, by="accession")

Create the depth thresholds used to identify deletions and duplications.

depth_threshold <- 0.6
del_threshold <- 1 - depth_threshold
dup_threshold <- 1 + depth_threshold

Create dataframe for the called duplications only dupCNVs.

Code
dup_calls <- filter(cnv_calls, cnv == "duplication")

Filter out high depth dupCNVs

To filter out the dupCNVs that have very large Normalized Depth. Here I make groups of CNVs that start at the same position in the same chromosome.

Code
dup_groups <- dup_calls %>%
    group_by(lineage, chromosome, start, end, region_size) %>%
    summarize(max_depth = max(norm_depth),
              median_depth = median(norm_depth),
              n = n(),
              median_repeat = median(repeat_fraction))

The dupCNVs groups with max depth larger than extreme_depth:

extreme_depth <- 5
Code
large_dups <- dup_groups %>%
                        filter((max_depth > extreme_depth))%>%
                        arrange(desc(n))
head(large_dups)
lineage chromosome start end region_size max_depth median_depth n median_repeat
VNI chr01 339501 350500 11000 9.73 6.140 489 0.00
VNI chr02 274001 281500 7500 115.79 50.645 428 0.63
VNI chr02 274501 281000 6500 100.90 45.920 359 0.57
VNI chr10 582001 587000 5000 6.48 4.770 151 0.51
VNI chr03 1533001 1537000 4000 18.36 6.960 117 0.08
VNI chr01 1 2000 2000 32.38 5.410 108 0.00

Filter out duplications that are part of the groups in the previous table.
This way of filtering removes individual CNVs that not necessarily have depth grater than 5 but other samples in the lineage have a high depth CNV in the same region.

Code
dup_calls_filter_depth <- dup_calls %>%
    filter(!paste(start, end, chromosome, lineage) %in% 
           paste(large_dups$start, large_dups$end, large_dups$chromosome, large_dups$lineage))

Filter out high repeat content CNVs

repeat_threshold = 0.25
Code
repetitive_dups <- dup_groups %>%
                        filter((median_repeat > 0.25))%>%
                        arrange(desc(n))
Code
dup_calls_filtered <- dup_calls_filter_depth %>%
    filter(!paste(start, end, chromosome, lineage) %in% 
           paste(repetitive_dups$start, repetitive_dups$end, 
                repetitive_dups$chromosome, repetitive_dups$lineage))

Chromosome depth

Code
depth_by_chrom_good_desjardins <- read.delim(
    depth_by_chrom_good_desj_path,
     header=TRUE, sep="\t")
depth_by_chrom_good_ashton <- read.delim(
    depth_by_chrom_good_ashton_path,
     header=TRUE, sep="\t")
depth_by_chrom <- rbind(depth_by_chrom_good_desjardins, depth_by_chrom_good_ashton)%>%
    select(sample, accession, norm_chrom_mean, norm_chrom_median)%>%
    left_join(chromosomes, by = "accession")

head(depth_by_chrom)
sample accession norm_chrom_mean norm_chrom_median lineage chromosome length
SRS404518 CP003820.1 0.97 0.98 VNI chr01 2291499
SRS404518 CP003821.1 1.12 0.98 VNI chr02 1621675
SRS404518 CP003822.1 1.01 0.99 VNI chr03 1575141
SRS404518 CP003823.1 0.99 1.00 VNI chr04 1084805
SRS404518 CP003824.1 0.98 0.99 VNI chr05 1814975
SRS404518 CP003825.1 0.99 1.00 VNI chr06 1422463

Combine all information

The following dataframes contain a row per individual called duplication with the information about the CNV and the chromosome of the sample. For the chromosome-sample without any called duplication, the columns related to CNVs have NAs.
all_metrics is created with all dupCNVs. all_metrics_filtered with the dupCNVs filtered by depth and repeat fraction.

Code
all_metrics <- left_join(depth_by_chrom, dup_calls,
        by = c("sample", "lineage", "accession", "chromosome",  "length"))%>%
    left_join(metadata, by = c("sample", "lineage"))
head(all_metrics)
sample accession norm_chrom_mean norm_chrom_median lineage chromosome length start end cnv region_size depth norm_depth smooth_depth repeat_fraction overlap_bp feature_id strain source dataset vni_subdivision samples_in_dataset_lineage samples_in_lineage total_samples
SRS404518 CP003820.1 0.97 0.98 VNI chr01 2291499 NA NA NA NA NA NA NA NA NA NA Bt134 Clinical Desjardins VNIa-5 185 853 1055
SRS404518 CP003821.1 1.12 0.98 VNI chr02 1621675 274501 281000 duplication 6500 4049.51 37.85 37.32 0.57 3735 Bt134 Clinical Desjardins VNIa-5 185 853 1055
SRS404518 CP003822.1 1.01 0.99 VNI chr03 1575141 1533001 1537000 duplication 4000 1246.32 11.64 5.32 0.08 312 CNAG_06925,CNAG_06926 Bt134 Clinical Desjardins VNIa-5 185 853 1055
SRS404518 CP003823.1 0.99 1.00 VNI chr04 1084805 NA NA NA NA NA NA NA NA NA NA Bt134 Clinical Desjardins VNIa-5 185 853 1055
SRS404518 CP003824.1 0.98 0.99 VNI chr05 1814975 NA NA NA NA NA NA NA NA NA NA Bt134 Clinical Desjardins VNIa-5 185 853 1055
SRS404518 CP003825.1 0.99 1.00 VNI chr06 1422463 NA NA NA NA NA NA NA NA NA NA Bt134 Clinical Desjardins VNIa-5 185 853 1055
Code
all_metrics_filtered <- left_join(depth_by_chrom, dup_calls_filtered,
        by = c("sample", "lineage", "accession", "chromosome",  "length"))%>%
    left_join(metadata, by = c("sample", "lineage"))

Summarize the metrics of the called CNVs per chromosome

And keep the rest of the chromosome metrics.

Code
chrom_metrics <- all_metrics %>%
    group_by(sample, strain, lineage, 
            accession, chromosome, length,
            source, vni_subdivision, dataset,
            samples_in_lineage, samples_in_dataset_lineage, 
            total_samples, 
            norm_chrom_mean, norm_chrom_median)%>%
    summarise(total_cnv_size = sum(region_size, na.rm = TRUE),
                n_cnvs = sum(!is.na(cnv)), 
                first = min(start),
                last = max(end),
                mean_cnv_depth = round(mean(norm_depth),2),
                median_cnv_depth = round(median(norm_depth),2),
                repeat_size = sum(overlap_bp)) %>%
            ungroup()%>%
    mutate(dup_coverage_percent = round((total_cnv_size / length) * 100, 2),
            dup_span_size = ifelse(is.na(last - first), 0, last - first),
            dup_span_percent = round((dup_span_size / length) * 100, 2),
            dup_repeat_percent = round((repeat_size / length) * 100, 2),
            chromosome = as.factor(chromosome),
            dup_coverage_percent = ifelse(is.na(dup_coverage_percent), 0, dup_coverage_percent),
            dup_span_percent = ifelse(is.na(dup_span_percent), 0, dup_span_percent))
head(chrom_metrics)
sample strain lineage accession chromosome length source vni_subdivision dataset samples_in_lineage samples_in_dataset_lineage total_samples norm_chrom_mean norm_chrom_median total_cnv_size n_cnvs first last mean_cnv_depth median_cnv_depth repeat_size dup_coverage_percent dup_span_size dup_span_percent dup_repeat_percent
ERS1142690 20949_2#1 VNI CP003820.1 chr01 2291499 Clinical VNIa-4 Ashton 853 668 1055 1.00 0.98 11000 1 339501 350500 7.12 7.12 0 0.48 10999 0.48 0.00
ERS1142690 20949_2#1 VNI CP003821.1 chr02 1621675 Clinical VNIa-4 Ashton 853 668 1055 1.20 0.96 6500 1 274501 281000 64.92 64.92 3735 0.40 6499 0.40 0.23
ERS1142690 20949_2#1 VNI CP003822.1 chr03 1575141 Clinical VNIa-4 Ashton 853 668 1055 0.99 0.98 0 0 NA NA NA NA NA 0.00 0 0.00 NA
ERS1142690 20949_2#1 VNI CP003823.1 chr04 1084805 Clinical VNIa-4 Ashton 853 668 1055 1.00 1.00 0 0 NA NA NA NA NA 0.00 0 0.00 NA
ERS1142690 20949_2#1 VNI CP003824.1 chr05 1814975 Clinical VNIa-4 Ashton 853 668 1055 0.97 0.98 0 0 NA NA NA NA NA 0.00 0 0.00 NA
ERS1142690 20949_2#1 VNI CP003825.1 chr06 1422463 Clinical VNIa-4 Ashton 853 668 1055 1.00 1.00 0 0 NA NA NA NA NA 0.00 0 0.00 NA
Code
chrom_metrics_filtered <- all_metrics_filtered %>%
    group_by(sample, strain, lineage, 
            accession, chromosome, length,
            source, vni_subdivision, dataset,
            samples_in_lineage, samples_in_dataset_lineage, 
            total_samples, 
            norm_chrom_mean, norm_chrom_median)%>%
    summarise(total_cnv_size = sum(region_size, na.rm = TRUE),
                n_cnvs = sum(!is.na(cnv)), 
                first = min(start),
                last = max(end),
                mean_cnv_depth = round(mean(norm_depth),2),
                median_cnv_depth = round(median(norm_depth),2),
                repeat_size = sum(overlap_bp)) %>%
            ungroup()%>%
    mutate(dup_coverage_percent = round((total_cnv_size / length) * 100, 2),
            dup_span_size = ifelse(is.na(last - first), 0, last - first),
            dup_span_percent = round((dup_span_size / length) * 100, 2),
            dup_repeat_percent = round((repeat_size / length) * 100, 2),
            chromosome = as.factor(chromosome),
            dup_coverage_percent = ifelse(is.na(dup_coverage_percent), 0, dup_coverage_percent),
            dup_span_percent = ifelse(is.na(dup_span_percent), 0, dup_span_percent))
The total number of chromosomes in all samples in the dataset is: 14770
The total number of chromosomes in all samples with called duplications is: 5058
The total number of chromosomes in all samples with filtered called duplications is: 1790

Write tables

Create file with chrom_metrics without any filtering of original CNVs.
Create file with chrom_metrics_filtered with filters in the original CNVs.

Code
write_tsv(chrom_metrics, chrom_metrics_out_path)
write_tsv(chrom_metrics_filtered, chrom_metrics_filtered_out_path)

Explore Called CNVs

Code
cnv_calls %>%
    group_by(cnv) %>%
    summarize(
        count = n(),
        min_norm_depth = min(norm_depth, na.rm = TRUE),
        mean_norm_depth = mean(norm_depth, na.rm = TRUE),
        q25_norm_depth = quantile(norm_depth, probs = 0.25, na.rm = TRUE),
        median_norm_depth = median(norm_depth, na.rm = TRUE),
        q75_norm_depth = quantile(norm_depth, probs = 0.75, na.rm = TRUE),
        max_norm_depth = max(norm_depth, na.rm = TRUE),
        min_smooth_depth = min(smooth_depth, na.rm = TRUE),
        mean_smooth_depth = mean(smooth_depth, na.rm = TRUE),
        q25_smooth_depth = quantile(smooth_depth, probs = 0.25, na.rm = TRUE),
        median_smooth_depth = median(smooth_depth, na.rm = TRUE),
        q75_smooth_depth = quantile(smooth_depth, probs = 0.75, na.rm = TRUE),
        max_smooth_depth = max(smooth_depth, na.rm = TRUE),
        min_region_size = min(region_size, na.rm = TRUE),
        mean_region_size = mean(region_size, na.rm = TRUE),
        q25_region_size = quantile(region_size, probs = 0.25, na.rm = TRUE),
        median_region_size = median(region_size, na.rm = TRUE),
        q75_region_size = quantile(region_size, probs = 0.75, na.rm = TRUE),
        max_region_size = max(region_size, na.rm = TRUE),
    )
cnv count min_norm_depth mean_norm_depth q25_norm_depth median_norm_depth q75_norm_depth max_norm_depth min_smooth_depth mean_smooth_depth q25_smooth_depth median_smooth_depth q75_smooth_depth max_smooth_depth min_region_size mean_region_size q25_region_size median_region_size q75_region_size max_region_size
deletion 55594 0 0.079619 0.00 0.01 0.09 7.45 0.00 0.0747786 0.00 0.03 0.12 0.39 3 12935.77 5475 9000 17000 121500
duplication 9409 0 6.624390 1.66 1.79 2.47 115.79 1.61 6.4827144 1.64 1.74 2.13 115.79 244 14108.14 3000 5867 11000 726500

There are a lot more deletions than duplications.

Repeats

Distribution of Repeat Fraction

Code
d <- ggplot(cnv_calls, aes(repeat_fraction, fill = cnv)) +
            facet_wrap(~cnv, ncol = 1, scales = "free")+
            geom_histogram(binwidth = 0.01)+
            scale_x_continuous(labels = scales::comma)+
            theme_minimal() +
            theme(legend.position = "none")+
            labs(title = "Histogram of Repeat Fraction of CNVs",
                subtitle = paste("Binwidth: 0.01"),
                y = "Count",
                x = "Repeat Fraction")
d 

In deletions, more of them have high repetitive fraction. In duplications, most of them have low repetitive fraction.

Depth

Distribution of Normalized Depth and Smooth Normalized Depth

The Normalized depth (norm_depth) is the median of the normalized mean depth of the windows that are part of the CNV region.

The Smooth normalized depth (smooth_depth) is the median of the smooth normalized mean depth of the windows that are part of the CNV region.

Code
d <- ggplot(cnv_calls, aes(norm_depth, fill = cnv)) +
            facet_wrap(~cnv, ncol = 1)+
            geom_histogram(binwidth = 0.1)+
            scale_y_log10(labels = scales::comma) +
            theme_minimal() +
            theme(legend.position = "none")+
            labs(title = "Histogram of Normalized Depth",
            y = "Count (Log 10)",
            x = "Normalized Depth")
s <- ggplot(cnv_calls, aes(smooth_depth, fill = cnv)) +
            facet_wrap(~cnv, ncol = 1)+
            geom_histogram(binwidth = 0.1)+
            scale_y_log10(labels = scales::comma) +
            theme_minimal() +
            theme(legend.position = "none")+
            labs(title = "Histogram of Smooth Normalized Depth",
            y = "Count (Log 10)",
            x = "Smooth Normalized Depth")

d | s

There are CNVs with extremely high normalized depth. I will truncate the normalized depth to 5.

Code
d <- ggplot(cnv_calls, aes(norm_depth, fill = cnv)) +
            facet_wrap(~cnv, ncol = 1)+
            geom_histogram(binwidth = 0.01)+
            geom_vline(xintercept = del_threshold)+
            geom_vline(xintercept = dup_threshold)+
            scale_y_log10(labels = scales::comma) +
            xlim(0,5)+
            theme_minimal() +
            theme(legend.position = "none")+
            labs(title = "Histogram of Normalized Depth",
            subtitle = paste("Lines in depth threshold:", depth_threshold),
            y = "Count (Log 10)",
            x = "Normalized Depth (truncated)")
s <- ggplot(cnv_calls, aes(smooth_depth, fill = cnv)) +
            facet_wrap(~cnv, ncol = 1)+
            geom_histogram(binwidth = 0.01)+
            geom_vline(xintercept = del_threshold)+
            geom_vline(xintercept = dup_threshold)+
            scale_y_log10(labels = scales::comma) +
            xlim(0,5)+
            theme_minimal() +
            theme(legend.position = "none")+
            labs(title = "Histogram of Smooth Normalized Depth",
            subtitle = paste("Lines in depth threshold:", depth_threshold),
            y = "Count (Log 10)",
            x = "Smooth Normalized Depth (truncated)")

d | s

Most deletions have depth (normalized and smooth) close to 0. Most duplications have depth (normalized and smooth) close to the depth threshold with which CNVs were called.

Normalized Depth vs. Smooth Normalized Depth

Code
p <- ggplot(cnv_calls, aes(x = norm_depth, y = smooth_depth)) +
    geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
    geom_hex(bins = 50) +
    facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
    scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
    scale_x_continuous(labels = scales::comma) +
    scale_y_continuous(labels = scales::comma) +
    theme_minimal() +
    labs(title = "Hexbin Plot of Normalized Depth vs Smooth Normalized Depth",
        subtitle = "Panels by interval of Repeat Fraction",
         x = "Normalized Depth",
         y = "Smooth Normalized Depth")

o <- ggplot(cnv_calls, aes(x = norm_depth, y = smooth_depth)) +
    geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
    geom_hex(bins = 50) +
    facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
    scale_x_continuous(labels = scales::comma, limits = c(0,5)) +
    scale_y_continuous(labels = scales::comma, limits = c(0,5)) +
    scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
    theme_minimal() +
    labs(title = "Hexbin Plot of Normalized Depth vs Smooth Normalized Depth (Truncated axes)",
         subtitle = "Panels by interval of Repeat Fraction",
         x = "Normalized Depth",
         y = "Smooth Normalized Depth")
p / o

Since the CNVs are called using the smooth_depth, but it does not always coincide with the norm_depth, there are called CNVs in between the depth threshold used in the CNV calling. Most CNVs are in the diagonal where smooth depth is the same as normalized depth.

There are many duplications with extremely high depth that are in the same high interval of repetitive fraction.

There are more deletion in the high repetitive fraction interval. There are more duplications in the low repetitive fraction interval.

Size

Distribution of Sizes of CNVs

Code
d <- ggplot(cnv_calls, aes(region_size, fill = cnv)) +
            facet_wrap(~cnv, ncol = 1)+
            geom_histogram(binwidth = 1000)+
            scale_x_continuous(labels = scales::comma)+
            scale_y_log10(labels = scales::comma) +
            theme_minimal() +
            theme(legend.position = "none")+
            labs(title = "Histogram of Sizes of CNVs",
                y = "Count (Log10)",
                x = "Size of CNVs")
d 

Only duplication are larger than ~150kb.

Code
f <- ggplot(dup_calls_filtered, aes(y = region_size, x = length, color = chromosome)) +
            geom_smooth(method = "lm", se = TRUE, color = "blue", linetype = "dashed") +
            geom_point(alpha = 0.3)+
            scale_y_continuous(labels = scales::comma) +
            # facet_wrap(~lineage, ncol = 1)+
            theme_minimal() +
            theme(legend.position = "none")+
            labs(title = "Distribution of Size of\nFiltered dupCNVs by Length of Chromosome",
                y = "Size of CNVs",
                x = "Length of chromosome") 
u <- ggplot(dup_calls, aes(y = region_size, x = length, color = chromosome)) +
            geom_smooth(method = "lm", se = TRUE, color = "blue", linetype = "dashed") +
            geom_point(alpha = 0.3)+
            scale_y_continuous(labels = scales::comma) +
            # facet_wrap(~lineage, ncol = 1)+
            theme_minimal() +
            theme(legend.position = "none")+
            labs(title = "Distribution of Size of\nall dupCNVs by Length of Chromosome",
                y = "Size of CNVs",
                x = "Length of chromosome")
u | f

The dupCNVs are larger in smaller chromosomes.

Depth vs. Size

Code
p <- ggplot(cnv_calls, aes(x = region_size, y = smooth_depth)) +
    geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
    geom_hex(bins = 50) +
    facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
    scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
    scale_x_continuous(labels = scales::comma) +
    scale_y_continuous(labels = scales::comma) +
    theme_minimal() +
    labs(title = "Hexbin Plot of Smooth Normalized Depth vs CNV Size",
         subtitle = "Panels by interval of Repeat Fraction",
         x = "CNV Size",
         y = "Smooth Normalized Depth")

o <- ggplot(cnv_calls, aes(x = region_size, y = smooth_depth)) +
    geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
    geom_hex(bins = 50) +
    facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
    scale_x_continuous(labels = scales::comma) +
    scale_y_continuous(labels = scales::comma, limits = c(0,5)) +
    scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
    theme_minimal() +
    labs(title = "Hexbin Plot of Smooth Normalized Depth vs CNV Size (Truncated Y axis)",
         subtitle = "Panels by interval of Repeat Fraction",
         x = "CNV Size",
         y = "Smooth Normalized Depth")
p / o

There are many small CNVs in a high repetitive fraction interval that have extremely high depth.

Code
p <- ggplot(cnv_calls, aes(x = region_size, y = norm_depth)) +
    geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
    geom_hex(bins = 50) +
    facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
    scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
    scale_x_continuous(labels = scales::comma) +
    scale_y_continuous(labels = scales::comma) +
    theme_minimal() +
    labs(title = "Hexbin Plot of Normalized Depth vs CNV Size",
        subtitle = "Panels by interval of Repeat Fraction",
         x = "CNV Size",
         y = "Normalized Depth")

o <- ggplot(cnv_calls, aes(x = region_size, y = norm_depth)) +
    geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
    geom_hex(bins = 50) +
    facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
    scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
    scale_x_continuous(labels = scales::comma) +
    scale_y_continuous(labels = scales::comma, limits = c(0,5)) +
    theme_minimal() +
    labs(title = "Hexbin Plot of Normalized Depth vs CNV Size (Truncated Y axis)",
        subtitle = "Panels by interval of Repeat Fraction",
         x = "CNV Size",
         y = "Normalized Depth")
p / o

Mostly, the CNVs with very high smooth_depth are small.

The CNVs whose norm_depth is in between the depth thresholds are small. (These are the ones that the norm_depth and smooth_depth don’t coincide).

Some dupCNVs are large and have very high depth (the ones that are neither in the vertical cluster of small CNVs and high depth or the horizontal cluster of large CNVs and depth around 2):

Code
filter(cnv_calls, region_size > 50000, smooth_depth > 2.5)%>%
  select(sample, lineage, chromosome, norm_depth, smooth_depth, region_size, repeat_fraction)
sample lineage chromosome norm_depth smooth_depth region_size repeat_fraction
SRS885892 VNBI chr04 4.39 4.51 212000 0.03
SRS417676 VNI chr04 2.83 2.84 52000 0.08
SRS417676 VNI chr09 3.20 3.46 190000 0.02
ERS2541126 VNI chr13 2.50 2.58 129000 0.12
ERS2541274 VNI chr13 2.53 2.58 97500 0.09
ERS542397 VNI chr04 2.76 2.76 704500 0.03
ERS2541331 VNI chr04 3.76 3.78 82000 0.01
ERS2541212 VNI chr13 2.66 2.64 129000 0.12
ERS542490 VNI chr04 3.23 3.24 344000 0.05
ERS2541064 VNI chr06 5.10 5.16 65000 0.02

The CNVs with the highest Normalized Depth (smooth or not), are highly repetitive.

There are many delCNVs completely repetitive. The majority of the dupCNVs are not repetitive.

Explore dupCNVs

Depth vs. Repeats per Chromosome

Distribution of repeat fraction in dupCNVs after filtering out very high depth groups of CNVs.

Code
n <- ggplot(dup_calls_filter_depth, aes(x = chromosome, y = norm_depth, color = chromosome)) +
       geom_quasirandom()+
       facet_wrap(~cut(repeat_fraction, breaks = 4), ncol = 1) +
        theme_minimal() +
        ylim(0,5)+
        theme(legend.position = "none", axis.text.x = element_text(angle = 90))+
        labs(title = "Distribution of Normalized Depth of dupCNVs",
        subtitle = "Panels by interval of Repeat Fraction",
        y = "Normalized Depth",
        x = "Chromosome")
s <- ggplot(dup_calls_filter_depth, aes(x = chromosome, y = smooth_depth, color = chromosome)) +
       geom_quasirandom()+
       facet_wrap(~cut(repeat_fraction, breaks = 4), ncol = 1) +
        theme_minimal() +
        ylim(0,5)+
        theme(legend.position = "none", axis.text.x = element_text(angle = 90))+
        labs(title = "Distribution of Smooth Normalized Depth of dupCNVs",
        subtitle = "Panels by interval of Repeat Fraction",
        y = "Smooth Normalized Depth",
        x = "Chromosome")
n | s

Distribution of repeat fraction in dupCNVs after filtering out very high depth groups of CNVs and CNVs with repeat fraction above repeat_treshold.

Code
n <- ggplot(dup_calls_filtered, aes(x = chromosome, y = norm_depth, color = chromosome)) +
       geom_quasirandom()+
        theme_minimal() +
        ylim(0,5)+
        theme(legend.position = "none", axis.text.x = element_text(angle = 90))+
        labs(title = "Distribution of Normalized Depth of dupCNVs",
        y = "Normalized Depth",
        x = "Chromosome")
s <- ggplot(dup_calls_filtered, aes(x = chromosome, y = smooth_depth, color = chromosome)) +
       geom_quasirandom()+
        theme_minimal() +
        ylim(0,5)+
        theme(legend.position = "none", axis.text.x = element_text(angle = 90))+
        labs(title = "Distribution of Smooth Normalized Depth of dupCNVs",
        y = "Smooth Normalized Depth",
        x = "Chromosome")
n | s

Explore depth of chromosomes

The Normalized median depth of chromosomes (norm_chrom_median) is the normalized median of the depth of the positions in the whole chromosome. (First the median was calculated and then normalized). The Normalized mean depth of chromosomes (norm_chrom_mean) is the normalized mean of the depth of the positions in the whole chromosome. (First the mean was calculated and then normalized).

Code
med <- ggplot(chrom_metrics)+
        geom_histogram(aes(norm_chrom_median), binwidth = 0.01)+
        scale_x_continuous(breaks = seq(0,max(chrom_metrics$norm_chrom_median), by = 0.1)) +
        theme_minimal() +
        labs(title = "Histogram of Normalized median depth of chromosomes",
                y = "Count",
                x = "Normalized median depth of chromosomes")
mea <- ggplot(chrom_metrics)+
        geom_histogram(aes(norm_chrom_mean), binwidth = 0.01)+
        scale_x_continuous(breaks = seq(0,max(chrom_metrics$norm_chrom_median), by = 0.1)) +
        theme_minimal() +
        labs(title = "Histogram of Normalized mean depth of chromosomes",
                y = "Count",
                x = "Normalized mean depth of chromosomes")

med / mea

Code
med <- ggplot(filter(chrom_metrics, norm_chrom_median > 1.2 ))+
        geom_histogram(aes(norm_chrom_median), binwidth = 0.01)+
        scale_x_continuous(breaks = seq(0,max(chrom_metrics$norm_chrom_median), by = 0.1)) +
        scale_y_continuous(
           breaks = seq(0, max(table(round(chrom_metrics$norm_chrom_median ))) , 1)
         )+
        theme_minimal() +
        theme(panel.grid.minor = element_blank())+
        labs(title = "Histogram of Normalized median depth of chromosomes",
                subtitle = "Only values above 1.2",
                y = "Count",
                x = "Normalized median depth of chromosomes")
mea <- ggplot(filter(chrom_metrics, norm_chrom_mean > 1.2 ))+
        geom_histogram(aes(norm_chrom_mean), binwidth = 0.01)+
        scale_x_continuous(breaks = seq(0,max(chrom_metrics$norm_chrom_mean), by = 0.1)) +
        #scale_y_continuous(breaks = seq(0, max(table(round(chrom_metrics$norm_chrom_mean ))) , 1))+
        theme_minimal() +
        theme(panel.grid.minor = element_blank())+
        labs(title = "Histogram of Normalized mean depth of chromosomes",
                subtitle = "Only values above 1.2",
                y = "Count",
                x = "Normalized mean depth of chromosomes")
med / mea

Chromosomes with very high median depth:

Code
chrom_metrics %>%
  select(lineage, sample, strain, chromosome, norm_chrom_median, norm_chrom_mean)%>%
  filter(norm_chrom_median > 2.2)%>%
  arrange(norm_chrom_median)
lineage sample strain chromosome norm_chrom_median norm_chrom_mean
VNI ERS1142697 20427_2#6 chr12 2.24 2.20
VNI ERS2541126 BMD3144 chr13 2.24 2.23
VNBI SRS885841 NRHc5009.REL.INI chr14 2.25 2.23
VNI SRS404481 Bt139 chr13 2.27 2.20
VNI SRS885916 PMHc1031A.ENR.INI.LP chr12 2.31 2.34
VNI ERS2541212 04CN-03-039 chr13 2.45 2.39
VNBII SRS881180 PMHc1029.ENR.STOR chr12 2.45 2.42
VNI ERS542397 14936_1#6 chr04 2.74 2.34

Number of dupCNVs per Chromosome

Code
u <- ggplot(filter(chrom_metrics, n_cnvs > 0), aes(x = length, y = n_cnvs, color = chromosome))+
    geom_point(alpha = 0.1)+
    geom_smooth(method = "lm", se = TRUE, color = "blue", linetype = "dashed") +
    theme_minimal() +
    theme(legend.position = "none")+
    labs(title = "Number of dupCNVs\nper Chromosome by Length",
            y = "Number of dupCNVs",
            x = "Length of Chromosome (bp)")
f <- ggplot(filter(chrom_metrics_filtered, n_cnvs > 0), aes(x = length, y = n_cnvs, color = chromosome))+
    geom_point(alpha = 0.1)+
    geom_smooth(method = "lm", se = TRUE, color = "blue", linetype = "dashed") +
    theme_minimal() +
    labs(title = "Number of filtered dupCNVs\nper Chromosome by Length",
            y = "Number of dupCNVs",
            x = "Length of Chromosome (bp)")
u | f

Larger chromosomes have more dupCNVs

Percentage of chromosome Covered by dupCNVs

Code
n_per_chrom <- chrom_metrics %>%
                    filter(n_cnvs > 0)%>%
                    group_by(chromosome, lineage, length)%>%
                    summarize(n = n())
n_per_chrom_filtered <- chrom_metrics_filtered %>%
                    filter(n_cnvs > 0)%>%
                    group_by(chromosome, lineage, length)%>%
                    summarize(n = n())

u <- ggplot(n_per_chrom, aes(x = length, y = n, color = chromosome))+
    geom_col()+
    geom_smooth(method = "lm", se = TRUE, color = "blue", linetype = "dashed") +
    theme_minimal() +
    theme(legend.position = "none")+
    labs(title = "Number of Chromosomes with CNVs by Length",
            y = "Number of Chromosomes",
            x = "Length of Chromosome (bp)")
f <- ggplot(n_per_chrom_filtered, aes(x = length, y = n, color = chromosome))+
    geom_col()+
    geom_smooth(method = "lm", se = TRUE, color = "blue", linetype = "dashed") +
    theme_minimal() +
    labs(title = "Number of Chromosomes with filtered CNVs by Length",
            y = "Number of Chromosomes",
            x = "Length of Chromosome (bp)")
u | f

Code
ggplot(filter(chrom_metrics, n_cnvs > 0))+
        geom_quasirandom(aes(x = chromosome, y = dup_coverage_percent, color = chromosome), alpha = 0.1)+
        theme_minimal() +
        labs(title = "Percent of Chromosome Covered by dupCNVs per Chromosome",
                subtitle = "Only chromosomes with at least 1 dupCNV",
                y = "Percent of Chromosome Covered by dupCNVs",
                x = "Chromosome")

Code
p <- ggplot(filter(chrom_metrics, dup_coverage_percent > 10))+
        geom_quasirandom(aes(x = chromosome, y = dup_coverage_percent))+
        theme_minimal() +
        labs(title = "Percent of Chromosome Covered by dupCNVs per Chromosome",
                subtitle = "Only chromosomes with at least 10% covered",
                x = "Chromosome",
                y = "Percent of Chromosome Covered by dupCNVs")
p

Code
p <- ggplot(filter(chrom_metrics, n_cnvs > 0))+
        geom_quasirandom(aes(x = chromosome, y = dup_span_percent))+
        theme_minimal() +
        labs(title = "Percent of Chromosome Spanned by dupCNVs per Chromosome",
                subtitle = "Only chromosomes with at least 1 dupCNV",
                y = "Percent of Chromosome Spanned by dupCNVs",
                x = "Chromosome")
p

Code
p <- ggplot(filter(chrom_metrics, dup_coverage_percent > 10))+
        geom_quasirandom(aes(x = chromosome, y = dup_span_percent))+
        theme_minimal() +
        labs(title = "Percent of Chromosome Spanned by dupCNVs per Chromosome",
                subtitle = "Only chromosomes with at least 10% spanned",
                y = "Percent of Chromosome Spanned by dupCNVs",
                x = "Chromosome")
ggMarginal(p, type = "histogram")

Code
ggplot(chrom_metrics, aes(y = norm_chrom_median, x = chromosome))+
        geom_boxplot()+
        geom_hline(yintercept =1, color = "black", linetype = "solid")+
        geom_hline(yintercept =2, color = "black", linetype = "solid")+
        theme_minimal() +
        labs(title = "Histogram of Normalized median depth of chromosomes",
                y = "Count",
                x = "Normalized median depth of chromosomes")

Code
ggplot(chrom_metrics, aes(y = norm_chrom_median, x = chromosome, color = lineage))+
        geom_boxplot()+
        geom_hline(yintercept =1, color = "black", linetype = "solid")+
        geom_hline(yintercept =2, color = "black", linetype = "solid")+
        theme_minimal() +
        labs(title = "Histogram of Normalized median depth of chromosomes",
                y = "Count",
                x = "Normalized median depth of chromosomes")

Code
ggplot(chrom_metrics, aes(y = norm_chrom_median, x = length, color = chromosome))+
    geom_point(alpha = 0.05)+
    facet_wrap(~lineage, ncol = 1)+
    geom_smooth(method = "lm", se = TRUE, color = "blue", linetype = "dashed") +
    # geom_hline(yintercept =1, color = "black", linetype = "solid")+
    # geom_hline(yintercept =2, color = "black", linetype = "solid")+
    theme_minimal() +
    labs(title = "Histogram of Normalized median depth of chromosomes",
        y = "Normalized median depth of chromosomes",
        x = "Chromosome length")

Code
ggplot(chrom_metrics, aes(y = norm_chrom_median, x = length, color = chromosome))+
    geom_point(alpha = 0.05)+
    geom_smooth(method = "lm", se = TRUE, color = "blue", linetype = "dashed") +
    geom_hline(yintercept =1, color = "black", linetype = "solid")+
    geom_hline(yintercept =2, color = "black", linetype = "solid")+
    theme_minimal() +
    labs(title = "Histogram of Normalized median depth of chromosomes",
        y = "Normalized median depth of chromosomes",
        x = "Chromosome length")

Code
ggplot(filter(chrom_metrics, lineage == "VNI"), aes(y = norm_chrom_mean, x = length, color = chromosome))+
        geom_boxplot()+
        # geom_hline(yintercept =1, color = "black", linetype = "solid")+
        # geom_hline(yintercept =2, color = "black", linetype = "solid")+
        theme_classic() +
        labs(title = "Histogram of Normalized median depth of chromosomes",
                y = "Count",
                x = "Normalized median depth of chromosomes")

Code
ggplot(filter(chrom_metrics, lineage == "VNI"), aes(y = norm_chrom_mean, x = length))+
        geom_hex(bins = 50)+
        geom_hline(yintercept =1, color = "black", linetype = "solid")+
        geom_hline(yintercept =2, color = "black", linetype = "solid")+
        scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +

        theme_classic() +
        labs(title = "Histogram of Normalized median depth of chromosomes",
                y = "Count",
                x = "Normalized median depth of chromosomes")

Explore relationship between CNV chromosomal metrics

Code
ggplot(filter(chrom_metrics, n_cnvs > 0), aes(x = dup_coverage_percent, y = dup_span_percent)) +
        geom_hex(bins=50) +
        scale_x_continuous(labels = scales::comma) +
        geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
        scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
        theme_minimal() +
        theme(legend.position = "right")+
        labs(title = "Percent of Chromosome Spanned by dupCNVs vs Covered by dupCNVs",
            y = "Percent of Chromosome Spanned by dupCNVs",
            x = "Percent of Chromosome Covered by dupCNVs")

Code
ggplot(filter(chrom_metrics, n_cnvs > 0), aes(x = dup_coverage_percent, y = n_cnvs)) +
        geom_hex(bins=50) +
        scale_x_continuous(labels = scales::comma) +
        scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
        theme_minimal() +
        theme(legend.position = "right")+
        labs(title = "Number of CNVs vs Percent of Chromosome Covered by dupCNVs ",
            y = "Number of CNVs",
            x = "Percent of Chromosome Covered by dupCNVs")

Code
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = dup_span_percent)) +
        geom_point(aes(color = n_cnvs)) +
        scale_x_continuous(labels = scales::comma) +
        scale_color_distiller(palette = "GnBu", direction = -1, trans = "log2", name = "Number\n of CNVs") +
        theme_minimal() +
        theme(legend.position = "right")+
        labs(title = "Percent of Chromosome Covered by CNVs vs Spanned by CNVs",
            y = "Percent of Chromosome Spanned by CNVs",
            x = "Percent of Chromosome Covered by CNVs")

Code
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = dup_span_percent)) +
        geom_point(aes(color = norm_chrom_median)) +
        scale_x_continuous(labels = scales::comma) +
        scale_color_viridis_c(name = "Normalized Median\nDepth of Chromosome", direction = -1) +
        theme_minimal() +
        theme(legend.position = "right")+
        labs(title = "Percent of Chromosome Covered by CNVs vs Spanned by CNVs",
            y = "Percent of Chromosome Spanned by CNVs",
            x = "Percent of Chromosome Covered by CNVs")

Code
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = dup_span_percent)) +
        geom_point(aes(color = norm_chrom_mean)) +
        facet_wrap(~cut(norm_chrom_median, breaks = 6), nrow = 2) +
        scale_x_continuous(labels = scales::comma) +
        scale_color_viridis_c(name = "Normalized Mean\nDepth of Chromosome", direction = -1) +
        theme_minimal() +
        theme(legend.position = "right")+
        labs(title = "Percent of Chromosome Covered by CNVs vs Spanned by CNVs",
            y = "Percent of Chromosome Spanned by CNVs",
            x = "Percent of Chromosome Covered by CNVs")

Compare metrics of dupCNVs with chromosome depth

The Normalized depth (norm_depth) is the median of the normalized depth of the windows that are part of each dupCNV region.

The Median depth of dupCNV (median_cnv_depth) is the median of the Normalized depth of all dupCNVs in the chromosome. There are values bellow the threshold of depth to call a Duplication CNV because that threshold was applied to the smoothed values and this are the raw values.

The Normalized median depth of chromosomes (norm_chrom_median) is the normalized median of the depth of the positions in the whole chromosome. (First the median was calculated and then normalized).

The Percent of Chromosome Covered by dupCNVs (dup_coverage_percent) is the percentage of the full length of the chromosome that is part of called dupCNVs.

The Percent of Chromosome Spanned by dupCNVs (dup_span_percent) is the percentage of the full length of the chromosome that is in between the leftmost and rightmost dupCNVs.

Code
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
        geom_hex(bins = 50)+
        scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
        theme_minimal() +
        theme(legend.position = "right") +
        labs(title = "Percent of Chromosome Covered by dupCNVs vs. Depth of Chromosome",
             y = "Normalized Median Depth of Chromosome",
             x = "Percent of Chromosome Covered by dupCNVs")

Code
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = norm_chrom_mean)) +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
        geom_hex(bins = 50)+
        scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
        theme_minimal() +
        theme(legend.position = "right") +
        labs(title = "Percent of Chromosome Covered by dupCNVs vs. Depth of Chromosome",
             y = "Normalized Mean Depth of Chromosome",
             x = "Percent of Chromosome Covered by dupCNVs")

Code
color_range <- paste("2.6 -", max(chrom_metrics$median_cnv_depth, na.rm = TRUE))
colors <- c("#FDE725", "#1F9E89", "#440154")
names(colors) <-c("0 - 1.6", "1.6 - 2.6", color_range)
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
        geom_point(aes(color = cut(median_cnv_depth, 
                            breaks = c(-Inf, 1.6, 2.6, Inf), 
                            labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
                            alpha = 0.5) +
        scale_color_manual(values = colors, 
                            name = "Median dupCNV Depth") +
        theme_minimal() +
        theme(legend.position = "right") +
        labs(title = "Percent of Chromosome Covered by dupCNVs vs. Depth of Chromosome",
             y = "Normalized Median Depth of Chromosome",
             x = "Percent of Chromosome Covered by dupCNVs")

Code
color_range <- paste("2.6 -", max(chrom_metrics$mean_cnv_depth, na.rm = TRUE))
colors <- c("#FDE725", "#1F9E89", "#440154")
names(colors) <-c("0 - 1.6", "1.6 - 2.6", color_range)
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = norm_chrom_mean)) +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
        geom_point(aes(color = cut(mean_cnv_depth, 
                            breaks = c(-Inf, 1.6, 2.6, Inf), 
                            labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
                            alpha = 0.5) +
        scale_color_manual(values = colors, 
                            name = "Mean dupCNV Depth") +
        theme_minimal() +
        theme(legend.position = "right") +
        labs(title = "Percent of Chromosome Covered by dupCNVs vs. Depth of Chromosome",
             y = "Normalized Mean Depth of Chromosome",
             x = "Percent of Chromosome Covered by dupCNVs")